#R Script - for "The .GOV Internet Archive: A Big Data Resource for Political Science", doi:10.7910/DVN/YINHYL, Harvard Dataverse
#Emily Kalah Gade
#28 Feb 2017

#load lib

library(gplots)
library(np)
library(xlsx)
library(entropy)
library(ggplot2)
library(tile)
library(reshape)
library(dplyr)
# library() stops immediately if the package is missing; require() would only
# return FALSE and let the script fail later at brewer.pal().
library(RColorBrewer)

# Nine-colour "Set1" palette; the figures below pick line colours from it by
# index (col[1], col[2], ...).
col <- brewer.pal(9, "Set1")

# Directory holding the replication files. The path is specific to the
# original author's machine, so switch to it only when it exists; otherwise
# keep the current working directory rather than aborting the script.
data_dir <- "/Users/emilykalahgade/Desktop/dataversefiles/"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data directory not found: ", data_dir,
          " -- reading input files from ", getwd())
}

############ Figure 2 is a barplot made in Excel using this set of code/data: #####

##In Pig:
#Checksum = LOAD '/dataset/gov/url-ts-checksum/' USING PigStorage() AS (surt:chararray, date:chararray, checksum:chararray);
#ChecksumWithYear = FOREACH Checksum GENERATE surt as url, SUBSTRING(date, 0, 4) as year, checksum;
#GroupYear = GROUP ChecksumWithYear BY year;       
#YearCounts = FOREACH GroupYear GENERATE group as year, COUNT(ChecksumWithYear);
#dump YearCounts

#Dumped count results
# (1995,232)
# (1996,565466)
# (1997,6038427)
# (1998,2968886)
# (1999,5900947)
# (2000,21256643)
# (2001,40764774)
# (2002,32558174)
# (2003,60201849)
# (2004,173514421)
# (2005,119806344)
# (2006,242944266)
# (2007,107780464)
# (2008,175803829)
# (2009,550597560)
# (2010,547109256)
# (2011,229008331)
# (2012,216388067)
# (2013,121877560)


############ Figure 4 - Issue Attention ##################

# Figure 4: issue attention over time, without smoothing.
# Uses the columns Year, Terrorism, Finance and Climate of the CSV and draws
# one line per issue on a single panel.
d <- read.csv("Figure4_IssueAttention.csv",
              stringsAsFactors = FALSE, header = TRUE)

# Line traces (all on panel 1); the finance line is dashed (lty = 2).
traceA <- linesTile(x = d$Year, y = d$Terrorism, col = col[2], plot = 1)
traceB <- linesTile(x = d$Year, y = d$Finance, col = col[3], lty = 2, plot = 1)
traceC <- linesTile(x = d$Year, y = d$Climate, col = col[4], plot = 1)

# Text labels placed inside the plot area, coloured like the line they name.
legendTraceA <- textTile(labels = "Terrorism Terms",
                         x = 2010, y = .00017,
                         col = col[2], plot = 1)
legendTraceB <- textTile(labels = "Financial Crisis Terms",
                         x = 2010, y = .00011,
                         col = col[3], plot = 1)
legendTraceC <- textTile(labels = "Climate Change Terms",
                         x = 2010, y = .000055,
                         col = col[4], plot = 1)

# Render to PDF: a 1 x 1 layout with x limits 2000-2012 and y limits 0-0.00025.
l.tile <- tile(traceA, traceB, traceC,
               legendTraceA, legendTraceB, legendTraceC,
               output = list(wide = 6.5,
                             outfile = "Figure1_NO_smoothing.pdf",
                             type = "pdf"),
               RxC = c(1, 1),
               limits = c(2000, 2012, 0, .00025),
               xaxistitle = list(labels = "Year"),
               yaxistitle = list(labels = "Term Proportion"),
               frame = TRUE)




############ Entropy Figure - 5  ####################


#load data

# Entropy input for Figure 5. The code below reads data$URL, data$Year and
# data$prop; rows containing any NA are dropped first.
data <- read.csv("entropy.csv", stringsAsFactors = FALSE, header = TRUE)
data <- na.omit(data)

head(data, 100)

# Distinct URLs in the input. Not used in this section; kept because later
# code may refer to it.
urls <- unique(data$URL)

# Maximum-likelihood ("ML") entropy of the `prop` values recorded in each year.
# vapply() replaces a 1:length() loop that grew a list element by element and
# guarantees exactly one numeric value per year.
years <- seq(2000, 2014, by = 1)
ent <- vapply(
  years,
  function(yr) entropy(data$prop[data$Year == yr], method = "ML"),
  numeric(1)
)

# One row per year: columns `years` and `ent`, as plotted in Figure 5.
dat <- data.frame(years = years, ent = ent)


# Figure 5: yearly entropy drawn as one thick line (lwd = 8) on panel 1.
traceA <- linesTile(
  x = dat$years,
  y = dat$ent,
  col = col[2],
  lwd = 8,
  plot = 1
)

# Text label object; it is built here but not handed to tile() below.
legendTraceA <- textTile(
  labels = "Terrorism Terms",
  x = 2004,
  y = 1.3,
  col = col[2],
  plot = 1
)

# Render to PDF: single panel, x limits 2000-2012, y limits 2.3-2.8.
l.tile <- tile(
  traceA,
  RxC = c(1, 1),
  limits = c(2000, 2012, 2.3, 2.8),
  xaxistitle = list(labels = "Year"),
  yaxistitle = list(labels = "Entropy of Terrorism Attention"),
  frame = TRUE,
  output = list(wide = 6.5, outfile = "Entropy_Figure_v3_NoSmoothing", type = "pdf")
)


############# Figure 6 ##############


# Figure 6: term proportion of 'Bubble' by institution, smoothed.
# Reads sheet 1 of the workbook, keeps its first seven columns and renames
# them to the short identifiers used below.
d2 <- read.xlsx("Figure6.xls", 1, stringsAsFactors = FALSE, header = TRUE)
d2 <- d2[, 1:7]
names(d2) <- c("year", "fdic", "fed", "sec", "treasu", "whitehouse", "congress")

# Replace missing cells with 0 in every column (one pass instead of seven
# copy-pasted right-assignments; `d2[] <-` keeps the data frame and its names).
d2[] <- lapply(d2, function(v) ifelse(is.na(v), 0, v))

# One smoothed line per institution, all on panel 1.
traceA <- linesTile(x = d2$year, y = smooth(d2$fdic), col = col[1], lty = 2, plot = 1)        # FDIC
traceB <- linesTile(x = d2$year, y = smooth(d2$fed), col = col[2], plot = 1)                  # Fed
traceC <- linesTile(x = d2$year, y = smooth(d2$sec), col = col[3], plot = 1)                  # SEC
traceD <- linesTile(x = d2$year, y = smooth(d2$treasu), col = col[4], plot = 1)               # Treasury
traceE <- linesTile(x = d2$year, y = smooth(d2$whitehouse), col = col[5], plot = 1)           # White House
traceF <- linesTile(x = d2$year, y = smooth(d2$congress), col = col[7], lty = 5, plot = 1)    # Congress

# In-plot text labels, coloured like the line they name.
legendTraceA <- textTile(labels = "FDIC",
                         x = 2011, y = .000008,
                         col = col[1], plot = 1)
legendTraceB <- textTile(labels = "Federal Reserve",
                         x = 2010, y = .000052,
                         col = col[2], plot = 1)
legendTraceC <- textTile(labels = "SEC",
                         x = 2009, y = .000003,
                         col = col[3], plot = 1)
legendTraceD <- textTile(labels = "Treasury",
                         x = 2006, y = .000036,
                         col = col[4], plot = 1)
legendTraceE <- textTile(labels = "White House",
                         x = 2008, y = .000012,
                         col = col[5], plot = 1)
legendTraceF <- textTile(labels = "Congress",
                         x = 2011, y = .000012,
                         col = col[7], plot = 1)

# Render to PDF: single panel, x limits 2000-2012, y limits 0-0.00008.
l.tile <- tile(traceA, traceB, traceC,
               traceD, traceE, traceF,
               legendTraceA, legendTraceB, legendTraceC,
               legendTraceD, legendTraceE, legendTraceF,
               output = list(wide = 6.5,
                             outfile = "FinacialCrisis_GOV_smoothing",
                             type = "pdf"),
               RxC = c(1, 1),
               limits = c(2000, 2012, 0, .00008),
               xaxistitle = list(labels = "Year"),
               yaxistitle = list(labels = "Term Proportion of 'Bubble'"),
               frame = TRUE)



############## Figure 7 #####################


#load data


# Figure 7: "climate change" versus "global warming" emphasis, smoothed.
# Reads sheet 1 of the workbook and renames its columns to the identifiers
# used below.
d2 <- read.xlsx("Figure7.xls", 1, stringsAsFactors = FALSE, header = TRUE)
names(d2) <- c("year", "congress", "whitehouse", "agencies")
head(d2)
str(d2)
d <- d2

# One smoothed line per branch, all on panel 1.
# NOTE(review): the original passed `lwt = 2` for the Congress line, a
# misspelling of the line-width argument `lwd`. It is corrected here; drop
# `lwd = 2` if the default width must be reproduced exactly.
traceA <- linesTile(x = d$year, y = smooth(d$congress), col = col[2], lwd = 2, plot = 1)  # Congress
traceB <- linesTile(x = d$year, y = smooth(d$whitehouse), col = col[3], plot = 1)         # White House
traceC <- linesTile(x = d$year, y = smooth(d$agencies), col = col[4], plot = 1)           # agencies

# In-plot text labels, coloured like the line they name.
legendTraceA <- textTile(labels = "Congress",
                         x = 2005, y = .53,
                         col = col[2], plot = 1)
legendTraceB <- textTile(labels = "White House",
                         x = 2008, y = .63,
                         col = col[3], plot = 1)
legendTraceC <- textTile(labels = "Agencies",
                         x = 2011, y = .85,
                         col = col[4], plot = 1)

# Render to PDF: single panel, x limits 2000-2012, y limits 0.3-1.
l.tile <- tile(traceA, traceB, traceC,
               legendTraceA, legendTraceB, legendTraceC,
               output = list(wide = 6.5,
                             outfile = "Figure_3_SMOOTHING",
                             type = "pdf"),
               RxC = c(1, 1),
               limits = c(2000, 2012, 0.3, 1),
               xaxistitle = list(labels = "Year"),
               yaxistitle = list(labels = "Climate Change versus Global Warming Emphasis"),
               frame = TRUE)




############### Checking out raw data ##############

# Raw counts. The code below uses the columns Year, Month, URLs, Word and
# Count. Strings are kept as character (matching the other read.csv calls in
# this script) so that recoding a URL further down cannot hit an unknown
# factor level and produce NA on R versions where factors are the default.
data <- read.csv("RawData.csv", comment.char = "#", sep = ",",
                 stringsAsFactors = FALSE)

# First day of each Year-Month as a Date.
data$year_Month <- as.Date(paste0(data$Year, "-", data$Month, "-01"))

# Collapse duplicate rows: total Count per Year / Month / URL / Word cell.
data2 <- aggregate(
  data$Count,
  by = list(data$Year, data$Month, data$year_Month, data$URLs, data$Word),
  FUN = sum
)

head(data2)
names(data2) <- c("Year", "Month", "year_Month", "URLs", "Word", "Count")
df <- data2

# URL root changed over course of data
df$URLs[df$URLs == "defense.gov"] <- "dod.gov"


# filter out totals over different time spans
# Rows whose Word is "total" are split out and summed at three granularities.
totals <- dplyr::filter(df, Word == "total")

# per month
totals1 <- aggregate(totals$Count,
                     by = list(totals$year_Month, totals$Year, totals$Month),
                     FUN = sum)
colnames(totals1) <- c("year_Month", "Year", "Month", "Totals")

# per year and URL
totals2 <- aggregate(totals$Count,
                     by = list(totals$Year, totals$URLs),
                     FUN = sum)
colnames(totals2) <- c("Year", "URLs", "Totals")

# per year
totals3 <- aggregate(totals$Count, by = list(totals$Year), FUN = sum)
colnames(totals3) <- c("Year", "Totals")

# URLs kept in every "_onlyFED" object below. Defined once so the list cannot
# drift between the individual filters.
fed_urls <- c(".house.gov", "defense.gov", "doi.gov", "energy.gov", "hhs.gov",
              "usda.gov", "usdoj.gov", "state.gov", "whitehouse.gov",
              "treasury.gov", "ed.gov", "dod.gov", "commerce.gov",
              ".senate.gov", "dhs.gov", "dot.gov")

# Sum `Count` within each Year x URLs cell of `d` and name the summed column
# `value_name`. Returns a data frame with columns Year, URLs, <value_name>.
sum_by_year_url <- function(d, value_name) {
  out <- aggregate(d$Count, by = list(d$Year, d$URLs), FUN = sum)
  colnames(out) <- c("Year", "URLs", value_name)
  out
}

# select words and URLs of interest
climatechange1 <- dplyr::filter(df, Word %in% c("climate change"))
climatechange2 <- dplyr::filter(df, Word %in% c("global warming"))
terrorism1 <- dplyr::filter(df, Word %in% c("terrorism"))
# "mortgage*" is matched literally, asterisk included.
mortgage <- dplyr::filter(df, Word %in% c("mortgage*"))

dataClimateChange1_onlyFED <- dplyr::filter(climatechange1, URLs %in% fed_urls)
dataClimateChange2_onlyFED <- dplyr::filter(climatechange2, URLs %in% fed_urls)
terrorism1_onlyFED <- dplyr::filter(terrorism1, URLs %in% fed_urls)
mortgage_onlyFED <- dplyr::filter(mortgage, URLs %in% fed_urls)

# Yearly counts per URL for each term.
# climate change
dataClimateChange1_onlyFED <- sum_by_year_url(dataClimateChange1_onlyFED, "ClimateChange")
# global warming
dataGlobalWarming_onlyFED <- sum_by_year_url(dataClimateChange2_onlyFED, "GlobalWarming")
# terrorism
terrorism1_onlyFED <- sum_by_year_url(terrorism1_onlyFED, "Terrorism")
# mortgage
mortgage_onlyFED <- sum_by_year_url(mortgage_onlyFED, "Mortgage")

totals_onlyFED <- dplyr::filter(totals2, URLs %in% fed_urls)

# Attach the Year x URLs totals to each term's counts. The join keys are the
# columns the two inputs share, spelled out here for readability; rows without
# a match on both sides are dropped (merge's default).
df2_climate <- merge(totals_onlyFED, dataClimateChange1_onlyFED, by = c("Year", "URLs"))
head(df2_climate)
df2_globalwarming <- merge(totals_onlyFED, dataGlobalWarming_onlyFED, by = c("Year", "URLs"))
head(df2_globalwarming)
df2_terrorism <- merge(totals_onlyFED, terrorism1_onlyFED, by = c("Year", "URLs"))
head(df2_terrorism)
df2_mortgage <- merge(totals_onlyFED, mortgage_onlyFED, by = c("Year", "URLs"))
head(df2_mortgage)

# Combine the four terms into one wide table (shared keys now include Totals).
df3 <- merge(df2_climate, df2_globalwarming, by = c("Year", "URLs", "Totals"))
df3_1 <- merge(df2_terrorism, df2_mortgage, by = c("Year", "URLs", "Totals"))
head(df3)
df4 <- merge(df3, df3_1, by = c("Year", "URLs", "Totals"))
head(df4)

# Term counts as a share of the Year x URLs total.
df4$frequency_global <- df4$GlobalWarming / df4$Totals
df4$frequency_climate <- df4$ClimateChange / df4$Totals
df4$frequency_terror <- df4$Terrorism / df4$Totals
df4$frequency_mortgage <- df4$Mortgage / df4$Totals

# everything together
df_test <- df4
head(df_test)
x <- cbind(df_test$frequency_global, df_test$frequency_climate,
           df_test$frequency_terror, df_test$frequency_mortgage)
# NOTE(review): this adds up the per-URL shares within each year (a sum of
# ratios), not total term count over total words -- confirm that is intended.
df_agByYear <- aggregate(x, by = list(df_test$Year), FUN = sum)
head(df_agByYear)
names(df_agByYear) <- c("Year", "Global", "CC", "terror", "mortgage")

### more complete word lists
climatechange <- dplyr::filter(df, Word %in% c(
  "climate change", "climategate", "anthropocene", "ocean acidification",
  "desertification", "anthropogenic", "greenhouse gas",
  "intergovernmental panel on climate change", "global warming"
))
terrorism <- dplyr::filter(df, Word %in% c(
  "terrorism", "known and suspected terror", "bioterror", "ksts"
))
fiscalCrisis <- dplyr::filter(df, Word %in% c(
  "lending crisis*", "toxic asset*", "subprime", "housing crisis*",
  "mortgage*", "bankrupt*"
))

# aggregate datasets
# Fiscal-crisis terms per year and URL (this overwrites the earlier data2).
data2 <- sum_by_year_url(fiscalCrisis, "fiscalCrisisCounts")

# Climate-change terms per month and URL.
data3 <- aggregate(climatechange$Count,
                   by = list(climatechange$year_Month, climatechange$URLs),
                   FUN = sum)
colnames(data3) <- c("year_Month", "URLs", "climateChangeCounts")

#depending which set of words you want to look at - below for climate change or terrorism
# data3<-aggregate(terrorism$Count, by= list(terrorism$Year, terrorism$URLs), FUN= sum)
# colnames(data3)<-c("Year", "URLs", "terrorismCounts")
# plot(data3$Year, data3$terrorismCounts,  type = "l")

# NOTE(review): despite its name this filters data2 (fiscal-crisis counts),
# not data3 -- behaviour kept as in the original; confirm which was intended.
data3_onlyFED <- dplyr::filter(data2, URLs %in% fed_urls)

# climate change counts by year
data4 <- aggregate(climatechange$Count, by = list(climatechange$Year), FUN = sum)
colnames(data4) <- c("Year", "climatechangeCounts")
#
# data4<-aggregate(terrorism$Count, by= list(terrorism$Year), FUN= sum)
# colnames(data4)<-c("Year", "terrorismCounts")
